Purpose: Revenue vs AdSpend effect

Author: Gerardo Gandara

Contact:

Client:

Code created: 2023-10-11

Last updated: 2023-10-11

Source: https://github.com/ggandara13/myrepository

Comment: https://getrecast.com/modern-media-mix-modeling/.

Libraries

These are the libraries to load

# Packages this analysis relies on; install any that are missing once with:
# install.packages(pkg)
pkg <- c(
  "pivottabler",
  "tidyverse",
  "tibbletime",
  "anomalize",
  "timetk"
)

# Pivot-table style grouping/aggregation (used for the channel spend pivot)
library(pivottabler)
# Date helpers such as year() and month(); note the base masks on attach
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Data-frame verbs (filter, mutate, group_by, ...)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Plotting
library(ggplot2)
# Loads the full tidyverse family (dplyr/ggplot2 above are re-attached here;
# keeping the attach order as-is so the masking recorded below stays valid)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Time-aware tibbles, required by the anomalize workflow below
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## 
## The following object is masked from 'package:stats':
## 
##     filter
# Anomaly detection (time_decompose / anomalize / time_recompose)
library(anomalize)
# plot_anomaly_diagnostics() used in question 2
library(timetk)
# autoplot() for ts objects used in question 4
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Read the revenue and AdSpend

We check the date type of the ‘date’ column and change the type to DATE

The CSV data is available in the GitHub repository

# NOTE(review): the original script set the working directory to a
# user-specific path. Both inputs are read straight from GitHub URLs,
# so no working directory is needed, and hard-coding a personal path
# breaks the script on any other machine. Left commented for reference:
# setwd("/Users/gerardogandara/Documents/recast/")

## Read both files as CSV directly from the repository
df_spend <- read.csv("https://raw.githubusercontent.com/ggandara13/myrepository/master/acme_spend.csv")
df_revenue <- read.csv("https://raw.githubusercontent.com/ggandara13/myrepository/master/acme_revenue.csv")

# Parse the character dates (month/day/two-digit-year) into Date objects
df_revenue$date <- as.Date(df_revenue$date, format = "%m/%d/%y")
df_spend$date <- as.Date(df_spend$date, format = "%m/%d/%y")

# Check the format and content of each dataframe.
# df_revenue: one row per day with revenue per sales channel;
# df_spend: one row per day *per marketing channel* (long format),
# hence 10x the rows of df_revenue.
str(df_revenue)
## 'data.frame':    1035 obs. of  5 variables:
##  $ X              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ date           : Date, format: "2020-01-01" "2020-01-02" ...
##  $ revenue_dtc    : num  16000 19306 22052 23177 26169 ...
##  $ revenue_amazon : num  50000 60332 68914 72428 81778 ...
##  $ revenue_walmart: num  34000 41026 46861 49251 55609 ...
str(df_spend)
## 'data.frame':    10350 obs. of  4 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ date   : Date, format: "2020-01-01" "2020-01-01" ...
##  $ channel: chr  "facebook_prospecting" "facebook_retargeting" "google_branded_search" "google_nonbranded_search" ...
##  $ spend  : num  4154 2679 474 3817 7421 ...

Check that there are no NULL values

# Count missing cells in each data frame. cat() renders "\n" as a real
# newline; the original paste() calls printed the escape sequence
# literally (e.g. "\nTotal of NULL in Revenue: 5").
cat("Total of NULL in Spend:", sum(is.na(df_spend)), "\n")
## Total of NULL in Spend: 69
cat("Total of NULL in Revenue:", sum(is.na(df_revenue)), "\n")
## Total of NULL in Revenue: 5
cat("\nSummary\n")
# summary() also reports the per-column NA counts (see the NA's rows)
summary(df_spend)
##        X              date              channel              spend      
##  Min.   :    1   Min.   :2020-01-01   Length:10350       Min.   :    0  
##  1st Qu.: 2588   1st Qu.:2020-09-15   Class :character   1st Qu.: 1323  
##  Median : 5176   Median :2021-06-01   Mode  :character   Median : 5358  
##  Mean   : 5176   Mean   :2021-06-01                      Mean   : 7792  
##  3rd Qu.: 7763   3rd Qu.:2022-02-15                      3rd Qu.:11711  
##  Max.   :10350   Max.   :2022-10-31                      Max.   :36401  
##                                                          NA's   :69
summary(df_revenue)
##        X               date             revenue_dtc     revenue_amazon  
##  Min.   :   1.0   Min.   :2020-01-01   Min.   : 16000   Min.   : 50000  
##  1st Qu.: 259.5   1st Qu.:2020-09-15   1st Qu.: 41685   1st Qu.:130464  
##  Median : 518.0   Median :2021-06-01   Median : 72121   Median :225380  
##  Mean   : 518.0   Mean   :2021-06-01   Mean   : 72882   Mean   :227502  
##  3rd Qu.: 776.5   3rd Qu.:2022-02-14   3rd Qu.: 97433   3rd Qu.:303896  
##  Max.   :1035.0   Max.   :2022-10-31   Max.   :193005   Max.   :415952  
##                                        NA's   :2        NA's   :2       
##  revenue_walmart 
##  Min.   : 34000  
##  1st Qu.: 88614  
##  Median :153184  
##  Mean   :154638  
##  3rd Qu.:206464  
##  Max.   :282848  
##  NA's   :1
# Missing cells expressed as a percentage of rows (NOTE(review): this is
# NA cells over row count, not over total cells — fine as a rough gauge)
sum(is.na(df_spend)) / nrow(df_spend) * 100
## [1] 0.6666667
sum(is.na(df_revenue)) / nrow(df_revenue) * 100
## [1] 0.4830918

Since we have 1,035 rows, we can either remove the missing values (they represent less than 1% of the data) or replace them with zero

Now, let’s replace with zero since it is a time series

# Zero-fill the missing cells in both tables: in this daily time-series
# context, a missing value is treated as zero activity for that day.
df_revenue <- replace(df_revenue, is.na(df_revenue), 0)
df_spend <- replace(df_spend, is.na(df_spend), 0)


# Confirm nothing is missing any more
cat("Total of NULL in Spend:", sum(is.na(df_spend)))
## Total of NULL in Spend: 0
cat("\nTotal of NULL in Revenue:", sum(is.na(df_revenue)))
## 
## Total of NULL in Revenue: 0

1- Which channel had the largest increase in spend in 2022 compared to the same date range in 2021?

Using lubridate to filter the year of the date, also use the pivottabler library to group/aggregate it

Answer: online_video

# Return the calendar year of a date column. [[ ]] extraction is used
# because it yields a plain vector for both data.frames and tibbles,
# whereas dataframe[, col] returns a one-column tibble on a tibble.
yearfunction <- function(dataframe, datecolumn) {
    year(dataframe[[datecolumn]])
}
df_spend$Year <- yearfunction(df_spend, "date")


# Keep only the two years being compared. No group_by() is needed here:
# the filter is purely row-wise and pivottabler does its own grouping.
df <- df_spend %>%
  filter(Year %in% c(2021, 2022))
df <- as.data.frame(df)


# Initialize the pivot: channels on rows, years on columns,
# total spend in the cells.
pt <- PivotTable$new()
pt$addData(df)
pt$addColumnDataGroups("Year")
pt$addRowDataGroups("channel")
pt$defineCalculation(calculationName = "TotalSpend",
                     summariseExpression = "sum(spend)")
pt$evaluatePivot()

# Convert the pivot to a data frame and compute the 2022-vs-2021
# percentage change per channel (column 1 = 2021, column 2 = 2022).
# NOTE(review): this measures percentage lift, not absolute increase in
# spend — confirm which interpretation the question intends.
df1 <- pt$asDataFrame()
df_summary_final <- df1 %>%
  mutate(index_lift = (df1[, 2] / df1[, 1] - 1) * 100)

# Sort descending so the channel with the largest lift comes first
df_summary_final <- df_summary_final[order(df_summary_final$index_lift,
                                           decreasing = TRUE), ]


# print the channel at the top of the ranking
paste("The channel with highest LIFT is:", row.names(df_summary_final)[1])
## [1] "The channel with highest LIFT is: online_video"
## [1] "The channel with highest LIFT is: online_video"

2- In terms of total revenue, are there any anomalous days?

Group by day to see the trend and plot

# Total daily revenue across the three sales channels. Bare column names
# inside mutate() use dplyr's data masking; the original df_revenue$col
# form bypasses masking and would silently ignore any grouping or
# filtering applied to the piped data.
df_revenue_total <- df_revenue %>%
  mutate(total_revenue = revenue_dtc + revenue_amazon + revenue_walmart)

# Collapse to one row per date with base aggregate(). The data is
# already daily, so this mainly guards against duplicate dates.
agg_df <- aggregate(df_revenue_total$total_revenue,
                    by = list(df_revenue_total$date), FUN = sum)
colnames(agg_df) <- c('date','revenue')


# Plot daily revenue over the full period
ggplot(agg_df, aes(x = date, y = revenue)) + geom_line()

paste("Visually we can see some outliers")
## [1] "Visually we can see some outliers"

Now, we can use the anomalize library to identify the values

Use time_decompose() to decompose a time series prior to performing anomaly detection with anomalize(). Typically, anomalize() is performed on the “remainder” of the time series decomposition.

The return has three columns: “remainder_l1” (lower limit for anomalies), “remainder_l2” (upper limit for anomalies), and “anomaly” (Yes/No).

# Convert the daily revenue data frame to a tibble, as required by the
# anomalize workflow (it auto-converts to tbl_time using the date index)
df <- as_tibble(agg_df)
class(df)
## [1] "tbl_df"     "tbl"        "data.frame"
# STL-decompose the series, flag anomalies on the remainder with the
# IQR method (alpha = 0.05, at most 20% of points flagged), then
# recompose to get the anomaly bands on the original scale
df_anomalized <- df %>%
    time_decompose(revenue, method = "stl", merge = TRUE) %>%
    anomalize(remainder, method = "iqr", alpha = 0.05, max_anoms = 0.2) %>%
    time_recompose()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = date
## frequency = 7 days
## trend = 91 days
# Visualize the flagged anomalies with plot_anomalies()
df_anomalized %>% plot_anomalies(ncol = 1, alpha_dots = 0.75)

We can print the detail of the anomalies, for example the last 4

# Show the four most recent days flagged as anomalous
df_anomalized %>%
  filter(anomaly == 'Yes') %>%
  tail(4)

You can also use the timetk package for dynamic plot

# Interactive anomaly diagnostics via timetk (independent cross-check of
# the anomalize results above)
agg_df %>% timetk::plot_anomaly_diagnostics(date,revenue, .facet_ncol = 2)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months

3- In which month of the year does Acme tend to make the most revenue?

We can compute a month column, then aggregate and sort to find the top-earning month

The most profitable month of the year is month number 3

# Derive the calendar month (1-12) of a date column. [[ ]] extraction
# yields a plain vector for both data.frames and tibbles, unlike the
# matrix-style dataframe[, col] form.
monthfunction <- function(dataframe, datecolumn) {
    month(dataframe[[datecolumn]])
}
agg_df$Month <- monthfunction(agg_df, "date")

# Total revenue per calendar month, summed across all years

by_month <- aggregate(agg_df$revenue, by = list(agg_df$Month), FUN = sum)
colnames(by_month) <- c('month','revenue')



# Sort descending so the top-earning month comes first
by_month <- by_month[order(by_month$revenue, decreasing = TRUE), ]

# print the ranking
print(by_month)
##    month  revenue
## 3      3 58922686
## 1      1 57540788
## 2      2 57270462
## 4      4 51753505
## 5      5 44977988
## 12    12 41153803
## 10    10 33736751
## 11    11 30650581
## 6      6 29816422
## 9      9 23007186
## 7      7 21236448
## 8      8 20124698
# Report the month column directly: row names only happen to equal the
# month numbers here, so relying on row.names() after sorting is fragile.
paste("Most profitable month in a year is the month number:", by_month$month[1])
## [1] "Most profitable month in a year is the month number: 3"

4- Does Acme’s marketing spend tend to follow a similar pattern to revenue?

We need to merge both dataframes to build the time series of Revenue with AdSpend as a regressor, convert the merged dataframe to a TS object, and then plot the results

Visually we can confirm they have a similar pattern.

# Acme-level revenue series: keep just the date and the combined total
df_only_revenue <- df_revenue_total[, c("date", "total_revenue")]

# Collapse spend across every marketing channel to one figure per day
# (formula interface yields the 'date' and 'spend' column names directly)
df_spend_total <- aggregate(spend ~ date, data = df_spend, FUN = sum)

# Align the two daily series on their common dates
df_rev_spend <- merge(df_only_revenue, df_spend_total, by = "date")


# Build a two-column time series object from the revenue and spend columns
acme_ts <- ts(data = df_rev_spend[, c(2, 3)])


# Plot each series in its own facet so their shapes can be compared
autoplot(acme_ts, facets = TRUE)